#!/usr/bin/python
# -*- coding: cp1251 -*-
r"""
NAME:
    pyDSL2HTM v. 0.21; last update: 07-Jan-2008

AUTHOR:
    zhuman 2012

DESCRIPTION:
    Parse Lingvo DSL source files (in Unicode UCS2-LE or ANSI
    encodings) and:
    1. Detect entry and field boundaries and insert markers:
    '<!--~-->' = as entry delimiter
    '<!--=-->' = as field delimiter (separates headword and definition)

    2. Convert DSL markup to HTML, according to conversion tables
    3. Save resulting file to UTF-8 or other supported encodings.

    There are two output modes:

        -p = Pretty HTML (with HTML HEADER and styles).
             This mode is the default; '-p' may be omitted.

        -r = Raw HTML (for database import)
             Note: In the raw HTML mode the <DT> tag is not prepended to the definition.

    !!!TODO Add [s] tag for links [/s]

    Copyright (C) 2007 xhuman <xh@ua.fm>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 2, or any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    for more details.


"""

import sys
import os
import re
import codecs
import time

"""
Configuration settings for pyDSL2HTM
"""
# Markers written between entries and between the headword and the
# definition of one entry (the raw output is meant for SQL import).
ENTRY_SEPARATOR = '%s<!--~-->' % os.linesep
FIELD_SEPARATOR = '<!--=-->'

# Extra text appended to the two markers above when the pretty HTML
# mode is active.
ADDITIONAL_ENTRY_SEPARATOR_PRETTY = '<DT>'
ADDITIONAL_FIELD_SEPARATOR_PRETTY = ''

# Marker that introduces every line of a definition.
NEW_LINE_MARKER = '<dd>'

# Plain-string replacements applied to the HEADWORD only.
HEADWORD_REPL = {
    u'{\u2027}': '',    # for Collins 2006
    "{[']}": '',        # remove accents until there are some better ideas
    "{[/']}": '',
    '\\': '',
}

# Plain-string replacements, first pass over the definition.
DSL2HTM_DICT_1ST_PASS = {
    '<<': '[ref]',
    '>>': '[/ref]',
}

# Plain-string replacements, second pass over the definition.
DSL2HTM_DICT_2ND_PASS = {
    '>': '&gt;',
    '<': '&lt;',
    '[!trs]': '',
    '[/!trs]': '',
}


# Regular-expression replacements for the third pass.
# Maps a compiled pattern to its replacement string; every pattern is
# written as a raw string so that the regex escapes (\[, \s, \d, ...)
# are not interpreted as (invalid) Python string escapes.
#
# NOTE(review): the rules are applied in dict iteration order, which is
# not guaranteed on older Python versions, and some rules overlap:
# "[trn]" is matched both by the span rule and by the "EMPTY TAGS"
# removal rule, and the backslash-removal rule changes what the tag rules
# see when it runs before them. Confirm the intended order before relying
# on the exact output for such input.
DSL2HTM_DICT_3RD_PASS = {
    # opening tags to SPANs
    re.compile(r'\[(m|c|trn|t|com|p|ex|ref)\]'): r'<span class="\1">',
    re.compile(r'\[\*\]'): '<span class="star">',
    re.compile(r"\['\]"): '<span class="stress">',
    re.compile(r'(\[ref\s[^\]]+\]|<<)'): '<span class="ref">',
    re.compile(r'>>'): '</span>',

    # closing SPANs
    re.compile(r"\[/(?:c|m|trn|t|com|p|ex|ref|'|\*)\]"): '</span>',

    # COLOR
    re.compile(r'\[c ([\w]+)\]'): r'<span class="col_\1">',

    # TO SELF
    re.compile(r'\[(sup|sub|u|i|b)\]'): r'<\1>',
    re.compile(r'\[/(sup|sub|u|i|b)\]'): r'</\1>',

    # Indentation (m1, m2, m3, m4...)
    re.compile(r'\[m(\d)\]'): r'<span class="m\1">',

    # EMPTY TAGS (remove)
    re.compile(r'\[/?(?:trn|lang)\]'): '',
    re.compile(r'\[lang\s[^\]]*\]'): '',

    # Remove escapes
    re.compile(r'\\'): '',

    # Dictionary specific
    re.compile(r'Trimis de.*Sursa:\s*'): '',  # rodex
    # re.compile(u'{\u2027}'):''               # collins 2006
    #'\t':'',
}

# HTML and CSS for pretty HTML output.
# HTML_HEADER contains exactly one "%s" placeholder, which is filled with
# the output charset name; keep the rest of the template free of "%".
HTML_HEADER = r'''<HTML>
<HEAD>
 <TITLE></TITLE>
 <META HTTP-EQUIV=Content-Type CONTENT="text/html; charset=%s" />
 <style type="text/css">
     body {padding: 1px}
      p, body, table {font-family: "Arial Unicode MS", verdana, arial; font-size: 9pt;}
      table { border: 1px solid #cdc1ae; border-collapse: collapse; padding: 0px; margin: 1px;}
      TH {border: 1px solid #cdc1ae;  }
      DT {text-align: left; border: 1px solid white; border-bottom: 1px solid #d5cdbf; border-right: 1px solid #e5dfce; color: #817e70;margin: 2px 80px ; padding: 5px 10px; background: #eff4e5; font-weight: bold}
      DD {text-align: left; margin: 0px 80px ; padding: 0px 110px; background: #f9f9fb}
      dd p {margin: 0px; padding: 1px}
      TD {border: 1px solid #cdc1ae; padding: 2px;vertical-align: top;}
      TR { padding: 2px;background:#fbf8e9 ; }
      h1 {font-size: 13pt; color: #999999}
      pre {font-family: "Lucida Console", "Lucida Sans Unicode"}
      A:link{Color : #00008B; Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none; }
      A:visited{Color : #00008B;Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none;}
      A:hover{Color : #A0522D;Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none;}
      table.search {border: 0px; background: #e8e7d8 }
      table.search td {text-align: right; border: 0px}
      .info {color: #bbbbbb; padding: 3px}
      .even {background-color: #f5f0df}
      input {border:1px solid #a3a3a3;background: #fffbd5; margin: 5px}
      select {background-color: #f9f2f2; color: Olive; border:1px solid #d6d1c0;}
      .button {border:1px solid #d6d1c0; background: #e8e7d8 }
      .dbg {color:#eaebec}
      .prev{display: inline; float: left; text-align:left; width: 100px; }
      .next{display: inline; float: right; text-align: right; width: 100px; }
      .star {color:#395666}
      .ref{font-weight: bold; color: #009999}
      .m1{margin-left: 0px;}
      .m2{margin-left: 5px;}
      .m3{margin-left: 10px;}
      .m4{margin-left: 14px;}
      .m2 b i {color: #996300}
      .m3 b {color: darkblue}
      .p1 {color: #333b14}
      .col1{color: #660066}
      .col2{color: #006600}
      .col3{color: #857d49}
      .col4 {color: gray}
      .m3 .col4{color: darkgreen}
      .ex {color: #000099; font-family: "Trebuchet MS", "Times New Roman"}
      .com{color: darkred}
 </style>
</HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#000099" VLINK="#330066" ALINK="#FF0000">
<DL>
'''

# Closes the elements opened by HTML_HEADER.
HTML_FOOTER =r'''
</DL>
</BODY>
</HTML>
'''
# ################################################################
# ################################################################
# ###################  END OF CONFIG SECTION  ####################
# ################################################################
# ################################################################


# DEFAULT ENCODINGS (detected at run-time, fall back to cp1251 on non-Unicode)
# An empty string means "not given on the command line"; the main script
# then fills the value in from the detected input file type.
INPUT_ENCODING  = ''
OUTPUT_ENCODING = ''
# True = pretty HTML output (-p, the default); False = raw output (-r).
PRETTY_HTML = True
# Checked at the end of the main script to decide whether the
# data-loss warning is printed.
DIRTY_CONVERSION = False

def Usage():
    """Print the command-line help text to stdout and exit with status -1.

    The program name shown in the text is the base name of sys.argv[0].
    """
    script_name = os.path.basename(sys.argv[0])
    help_text = '''USAGE:
    %s [-i Encoding] [-o Encoding] [-p] [-r] <Dictionary_Unicode.DSL> [Dictionary_converted.TxT]
    Where:
    -i, --input-encoding = input file encoding
    -o, --output-encoding = output file encoding
    -p, --pretty  = add HTML header and footer (default)
    -r, --raw     = don't add HTML header and footer

    Possible encodings:
    utf8        8-bit variable length encoding
    utf_16      16-bit variable length encoding (little/big endian)
    utf_16_le   utf-16 but explicitly little endian
    utf_16_be   utf-16 but explicitly big endian
    cp1251      Windows Cyrillic
    cp866       OEM Cyrillic (DOS)
    ascii       7-bit ASCII codepage
    iso-8859-1  ISO 8859-1 (Latin 1) codepage
    For a complete list of encodings see:
    http://docs.python.org/lib/standard-encodings.html

EXAMPLES:

    pyDSL2HTM oxford.dsl
    converts a Lingvo DSL Unicode file to HTML (UTF-8)

    pyDSL2HTM -o 1251 oxford.dsl
    converts a Lingvo DSL Unicode file to HTML (Windows-1251)''' % script_name
    # Same bytes as the print statement: the text followed by one newline.
    sys.stdout.write(help_text + '\n')
    sys.exit(-1)

def parseCommandLine():
    '''Process command line parameters.

    Reads sys.argv[1:] and stores the recognised options in the module
    globals INPUT_ENCODING (-i), OUTPUT_ENCODING (-o) and PRETTY_HTML
    (-p sets True, -r sets False).

    Returns the list of remaining positional arguments.

    On -h/--help, or on an unknown/malformed option, the usage text is
    printed via Usage(), which terminates the program.
    '''
    import getopt
    global INPUT_ENCODING, OUTPUT_ENCODING, PRETTY_HTML
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "i:o:hpr",
            ["input-encoding=", "output-encoding=", "help", "pretty", "raw"],
        )
    except getopt.GetoptError:
        # Tell the user what was wrong before showing the help text.
        # sys.exc_info() keeps this working on every Python version.
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        # The original called the undefined name `usage()` here, which
        # raised NameError instead of showing the help.
        Usage()
        # Not reached as long as Usage() exits; kept as a safeguard.
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            Usage()
        elif opt in ("-i", "--input-encoding"):
            INPUT_ENCODING = arg
        elif opt in ("-o", "--output-encoding"):
            OUTPUT_ENCODING = arg
        elif opt in ("-p", "--pretty"):
            PRETTY_HTML = True
        elif opt in ("-r", "--raw"):
            PRETTY_HTML = False
    return args

def calc_eta(start, now):
    """Format the time span between `start` and `now` as 'MM:SS'.

    Both arguments are timestamps in seconds. When the span is longer
    than 99 minutes the placeholder '--:--' is returned instead.
    """
    span = now - start
    minutes = span / 60
    if minutes > 99:
        return '--:--'
    seconds = span % 60
    return '%02d:%02d' % (minutes, seconds)

def isFileUnicode(file_name, BOMbyte='\xFF'):
    '''Detect if current file is Unicode by presence of BOM or nulls.

    Returns True when the first byte of `file_name` equals `BOMbyte`
    (a single byte, given as a one-character string), otherwise False.
    An empty file never matches.
    '''
    # The file is read in binary mode, so compare bytes with bytes.
    # On Python 2 a plain string already is a byte string.
    if not isinstance(BOMbyte, bytes):
        BOMbyte = BOMbyte.encode('latin-1')
    # `with` closes the file even if the read fails.
    with open(file_name, 'rb') as f:
        return f.read(1) == BOMbyte

def detectLineTerminator(file_name):
    r'''Return the line terminator used by the first line of `file_name`.

    The file is read in binary mode up to and including the first "\n"
    byte. The result is built from the bytes found in that chunk:
    "\r\n", "\n", "\r", or "" when neither byte is present.
    '''
    # `with` closes the file even if the read fails.
    with open(file_name, 'rb') as f:
        first_line = f.readline()

    # Byte literals: the data was read in binary mode.
    terminator = ""
    if b'\r' in first_line:
        terminator += '\r'
    if b'\n' in first_line:
        terminator += '\n'
    return terminator


def str_replace( text, dic ):
    """Replace every key of `dic` found in `text` with its value.

    The keys are processed one after another in sorted order, so a later
    replacement also sees the output of the earlier ones.
    Returns the new string; `text` is returned unchanged for an empty dic.
    """
    # sorted() works on every Python version; dic.keys().sort() does not.
    for key in sorted(dic):
        text = text.replace(key, dic[key])
    return text

def str_replace_re( text, dic ):
    """Do multiple regex replacements in text according
    to settings defined in dic.

    `dic` maps a pattern (compiled or string) to its replacement, which
    may contain group references such as \\1. The rules are applied one
    after another in the dictionary's iteration order.
    Returns the new string.
    """
    # items() works on every Python version; iteritems() does not.
    for pat, repl in dic.items():
        text = re.sub(pat, repl, text)
    return text

##def multiple_replace(text, dic):
##
##  """ Replace in 'text' all occurences of any key in the given
##  dictionary by its corresponding value.  Returns the new string."""
##
##  # Create a regular expression  from the dictionary keys
##  regex = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
##  #regex = re.compile("(%s)" % "|".join( dic.keys()))
##  # For each match, look-up corresponding value in dictionary
##  return regex.sub(lambda mo: dic[mo.string[mo.start():mo.end()]], text)
##
### data = codecs.open (, 'r', 'utf_16_le').read()

def do_DSL2HTML_conv(data):
    """Convert DSL markup in `data` to HTML and return the result.

    Runs the two plain-string replacement tables (1st and 2nd pass) and
    then the regular-expression table (3rd pass), in that order.
    """
    for plain_table in (DSL2HTM_DICT_1ST_PASS, DSL2HTM_DICT_2ND_PASS):
        data = str_replace(data, plain_table)

    # The last pass uses the regex version of the replace helper.
    return str_replace_re(data, DSL2HTM_DICT_3RD_PASS)

def write_entry(ofile, headWord, definition):
    '''Write an HTML formatted entry to output file.

    ofile      -- output file object opened for binary writing
    headWord   -- headword text; HEADWORD_REPL and backslash removal are
                  applied to it
    definition -- DSL definition text; converted to HTML, and every line
                  break in it is replaced by "\\n" + NEW_LINE_MARKER

    The entry is laid out as:
        ENTRY_SEPARATOR headword FIELD_SEPARATOR NEW_LINE_MARKER definition
    and written encoded in OUTPUT_ENCODING.

    If the entry cannot be encoded, a message is printed, the entry is
    skipped and the module flag DIRTY_CONVERSION is set.
    '''
    # Without this declaration the assignment below only created a local
    # variable and the final data-loss warning was never shown.
    global DIRTY_CONVERSION

    definition = do_DSL2HTML_conv(definition)

    # Normalise every kind of line break before inserting the marker.
    # The lines are not necessarily joined with the terminator detected
    # in the input file, so replacing only that terminator could miss
    # all breaks (or, for an empty terminator, hit every character).
    definition = definition.replace('\r\n', '\n').replace('\r', '\n')
    definition = definition.replace('\n', "\n" + NEW_LINE_MARKER)

    headWord = str_replace(headWord, HEADWORD_REPL)

    # remove escapes in headwords (even if HEADWORD_REPL was changed)
    headWord = headWord.replace('\\', '')

    #   <!--~-->  aadvark  <!--=-->  <dd>  a badger-sized African mammal
    entry = ENTRY_SEPARATOR + headWord + FIELD_SEPARATOR + NEW_LINE_MARKER + definition

    try:
        encoded = entry.encode(OUTPUT_ENCODING)
    except UnicodeEncodeError as e:
        sys.stdout.write("%s Please use '-o utf-8'!\n" % e)
        DIRTY_CONVERSION = True
        return
    ofile.write(encoded)

# #######################################################
# #####################    MAIN    ######################
# #######################################################


args = parseCommandLine()

if not args:
    Usage()   # prints the help text and exits

input_file = args[0]

END_OF_LINE = detectLineTerminator(input_file)

if PRETTY_HTML:
    ENTRY_SEPARATOR += ADDITIONAL_ENTRY_SEPARATOR_PRETTY
    FIELD_SEPARATOR += ADDITIONAL_FIELD_SEPARATOR_PRETTY


if not INPUT_ENCODING or not OUTPUT_ENCODING:
    # Guess the encodings from the first byte of the input file.
    if isFileUnicode(input_file) or isFileUnicode(input_file, '\x00'): # Unicode UCS2 little endian \xFF
        detected_input, default_output = 'utf_16', 'utf-8'

    elif isFileUnicode(input_file, '\xEF'):  # UTF-8 with BOM
        # utf_8_sig strips the BOM; plain utf-8 would leave it in front of
        # the first line, which then would not be recognised as a "#" line.
        detected_input, default_output = 'utf_8_sig', 'utf-8'

    else:
        detected_input, default_output = 'cp1251', 'cp1251'

    # Fill in only what was NOT given on the command line; previously an
    # explicit -i was overwritten whenever -o was omitted.
    if not INPUT_ENCODING:
        INPUT_ENCODING = detected_input
    if not OUTPUT_ENCODING:
        OUTPUT_ENCODING = default_output

if len(args) > 1:
    out_file_name = args[1]
else:
    # Default: <input base name>_<output encoding>.HTM in the current directory
    out_file_name = os.path.splitext(os.path.split(input_file)[-1])[0] \
    + '_' + OUTPUT_ENCODING + '.HTM'

if os.path.exists(out_file_name):
    # Any answer other than "N"/"n" is taken as permission to overwrite.
    answer = raw_input("\rFile "+out_file_name+" already exists! \nOverwrite? Y/N ")
    if answer.strip().upper() == 'N':
        sys.exit(-2)
    else:
        sys.stdout.write("\n")

# Resolve the output codec BEFORE the output file is created/truncated:
# an unknown encoding name raises LookupError here instead of leaving a
# clobbered, half-written file behind.
output_codec_name = codecs.lookup(OUTPUT_ENCODING).name

in_file = codecs.open(input_file, 'rb', encoding=INPUT_ENCODING)

sys.stdout.write('Input file:\t%s ("%s")\n' % (os.path.basename(input_file), INPUT_ENCODING))

# The output is opened as a plain binary file; every chunk is encoded
# explicitly before it is written.
out_file = open(out_file_name, 'wb')

# Compare the canonical codec name so that aliases such as "utf8" or
# "UTF-8" get the BOM as well, not only the exact spelling "utf-8".
if output_codec_name == 'utf-8':
    out_file.write(codecs.BOM_UTF8)

if PRETTY_HTML:
    # Write HTML_HEADER and insert appropriate HTML charset
    out_file.write((HTML_HEADER % OUTPUT_ENCODING).encode(OUTPUT_ENCODING))


sys.stdout.write('Output file:\t%s ("%s")\n\n' % (out_file_name, OUTPUT_ENCODING))

startTime = time.time()


##########################
# LINE-BASED IO
##########################
# Input format as handled below:
#   - blank lines and lines starting with "#" are skipped
#   - a line starting in column 0 is a HEADWORD
#   - a line starting with a space or tab is a DEFINITION line of the
#     current headword

headWord = ""
nextHeadWord = ""
definition = ""
endOfEntry = False
entryCount = 0

for line in in_file:

    line_stripped = line.strip()

    if not line_stripped:
        continue

    if line[0] == "#":
        continue

    if line[0] not in (" ", "\t"):  # is a HEADWORD
        if not headWord:
            headWord = line_stripped

        else:  # reached the end of entry
            endOfEntry = True

            nextHeadWord = line_stripped  # save next headword

    elif headWord:   # is a DEFINITION line
        definition += line_stripped + os.linesep

    if endOfEntry:
        write_entry(out_file, headWord, definition.strip())
        entryCount += 1
        definition = ""
        headWord = nextHeadWord
        endOfEntry = False

        # progress report every 1024 entries
        if not entryCount % 1024:
            sys.stdout.write("\rEntries: %d \tElapsed time: %s" %
                             (entryCount, calc_eta(startTime, time.time())))


# Write the last entry. It is only written when a headword was actually
# collected (no empty entry for an empty input), and it is counted too,
# so the total below is no longer one short.
if headWord:
    write_entry(out_file, headWord, definition.strip())
    entryCount += 1

if PRETTY_HTML:
    out_file.write(HTML_FOOTER.encode(OUTPUT_ENCODING))

# Close both files so the output is flushed before completion is reported.
in_file.close()
out_file.close()

if DIRTY_CONVERSION:
    # The encoding name is now actually inserted into the message.
    sys.stdout.write('''WARNING: Data was lost during conversion!
The selected output encoding "%s" does not support all the characters in the source file!
''' % OUTPUT_ENCODING)

elapsedTime = "%02d:%02d" % divmod(int(time.time() - startTime), 60)

sys.stdout.write("\rTotal entries processed: %d   Elapsed time: %s" % (entryCount, elapsedTime))

raw_input("\n\nConversion completed!\nPress <enter> to exit")


